import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import pandas_profiling
import seaborn as sns # Import data visualization library for statistical graphics
import matplotlib.pyplot as plt # Import data visualization library
# ====== For Linear Regression ======
# from scipy.stats import zscore, pearsonr
# from sklearn.preprocessing import StandardScaler, MinMaxScaler
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import statsmodels.api as sm
import pickle
from yellowbrick.regressor import ResidualsPlot
# ====== For Logistic Regression ======
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_curve, roc_auc_score, accuracy_score, classification_report
# ======= Set default style ========
### Multiple output displays per cell
# Make Jupyter render every expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# NOTE(review): `IPython.core.display` is deprecated in favour of
# `IPython.display` for display/HTML — confirm against the installed IPython.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:98% !important; }</style>")) # Increase cell width
pd.options.display.float_format = '{:,.2f}'.format # Remove scientific notations to display numbers with 2 decimals
# NOTE(review): this opens a 12x8 figure that nothing draws on; the rcParams
# size set below is what later plots actually use.
plt.figure(figsize=(12,8))
sns.set_style(style='darkgrid')
%matplotlib inline
# ===== Options =====
# Increase max number of rows and columns to display in pandas tables
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 20)
# Update default style and size of charts
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [10, 8]
# ---- Load the dataset and take a first look ----
ldf = pd.read_csv('Bank_Personal_Loan_Modelling.csv')  # bank personal-loan campaign data
print('* DataF Shape:', ldf.shape)   # (rows, columns)
print('\n* DataF Info:\n')
ldf.info()                           # dtypes and non-null counts per column
print('\n* DataF Describe/Stats:')   # typo fix: "Ststs" -> "Stats"
ldf.describe()                       # summary statistics of numeric columns
print('\n* DataF null values:')
ldf.isna().sum()                     # missing-value count per column
print('\n* DataF top 5 rows:')
ldf.head()
# ---- Sanity checks: non-numeric cells and negative values ----
numeric_cells = np.isreal(ldf).sum()  # cells holding real (numeric) values
total_cells = ldf.size                # every cell in the frame (rows * columns)
print('\n* Number of cells (rows,columns) having non numeric data:', total_cells - numeric_cells)
print('\n* Columns having -ve values:')
(ldf < 0).sum()
# From the above, we see that column "Experience" has 52 -ve negative values:
print('\n* 52 Row where Professional "Experience" column have -ve values:') # print 52 such rows
print('* These are Incorrect values or improper Imputations because it cant be -ve:') # print such rows
ldf.loc[ldf.Experience < 0]
# TODO (EDA checklist): number of unique values in each column; number of
# people with zero mortgage; number of people with zero credit-card spending
# per month; value counts of all categorical columns; univariate and bivariate
# analysis; then get the data model-ready.
# Keep an incremental backup so a mistake doesn't force a restart from scratch:
ldf0 = ldf.copy()  # Backup 0: the untouched original dataset
# "ID" is all-unique and "ZIP Code" is mostly unique; neither helps the model.
ldf.head()  # Before drop
ldf.drop(columns=['ID', 'ZIP Code'], inplace=True)
ldf.head()  # After drop
# Normalise the column names: lowercase everything first ...
ldf.rename(str.lower, axis='columns', inplace=True)
# ... then abbreviate the longer names for convenience.
ldf.rename(columns={'experience': 'expr', 'family': 'fml', 'education': 'edu',
                    'mortgage': 'mrtg', 'personal loan': 'ploan',
                    'securities account': 'secac', 'cd account': 'cdac',
                    'creditcard': 'ccrd'}, inplace=True, errors='raise')
ldf.head() # Looks good!
ldf1 = ldf.copy()  # Backup 1: columns dropped; names shortened and lowercased
type(ldf1)
# Imputation needed: professional experience ("expr") cannot be negative.
# The "expr" column carries 52 -ve (incorrect) values, as seen above; replace
# each with its absolute value.
ldf[ldf.expr < 0]  # Before imputation: 52 rows returned
# BUG FIX: the original used chained indexing (ldf.expr[mask] = ...), a
# SettingWithCopy pattern that is a silent no-op under pandas >= 2.0
# copy-on-write. .loc assigns on the frame itself.
ldf.loc[ldf.expr < 0, 'expr'] = ldf.loc[ldf.expr < 0, 'expr'].abs()
ldf[ldf.expr < 0]  # After imputation: zero rows with -ve "expr" values
print('\nPost Imputation: Rows with -ve Experience "expr" values:', (ldf.expr < 0).sum())  # typo fix: "Imptutation"
# Back up the work so far, in case of a later mistake:
ldf2 = ldf.copy()  # Backup 2: negative "expr" values imputed with abs()
type(ldf2)
ldf2.head()
# ---- Column cardinality, zero counts, and categorical value counts ----
print('* Unique values in each column:')
ldf.nunique()
print('\n* People with zero mortgage:', (ldf.mrtg==0).sum())
print('\n* People with zero Credit Card Spending per Month:', (ldf.ccavg==0).sum())
print('\n* Value Counts of all Categorical Columns: See Below:')
# FIX: top-level pd.value_counts was deprecated in pandas 1.5 and removed in
# 2.0; apply the Series method instead (identical output).
ldf[['fml','edu','ploan','secac','cdac','online','ccrd']].apply(pd.Series.value_counts)
print('\n* Value Counts tabulated above^, are same as if taken separately for each Column')
# Auto-generated EDA report (duplicates, correlations, distributions, ...).
# NOTE(review): pandas_profiling has been renamed ydata-profiling upstream —
# confirm which package the environment actually has installed.
profile = pandas_profiling.ProfileReport(ldf)
profile
# The profile report above flags 13 duplicate rows; verify and remove them.
print('\n* Duplicate Rows Found:', ldf.duplicated().sum())
print('\n* Duplicate Rows listed below:')
ldf[ldf.duplicated()]
print('\n* Duplicate Rows Sample Pairs listed below:')
ldf[ldf.duplicated(keep=False)].sort_values(by=['age']).head(10)
# Take backup before dup row drop:
ldf3 = ldf.copy()
type(ldf3)
ldf3.head()
len(ldf3)
# Remove the duplicate rows, keeping the first occurrence of each:
print('\n* Rows Before dup drop:', len(ldf3)) # backed up ldf before this operation
ldf.drop_duplicates(keep='first', inplace=True)
print('\n* Rows After dup drop:', len(ldf)) # rows after dup drop
print('\n* Rows dropped:', len(ldf3)-len(ldf), 'List dup rows below: None / No Dups:')
ldf[ldf.duplicated()]
# BUG FIX: the original summed the cell *values* of the duplicated rows
# (ldf[ldf.duplicated()].sum().sum()), which is not a count of anything.
# The number of remaining duplicate rows is simply:
print('\n* Number of Duplicate Row after Drop:', int(ldf.duplicated().sum()))
# Bank Relation/Liability: deposit-side products: Mortgage, Securities/Stocks, CD/FD, Credit Card, Online A/c.
# Bank Relation/Asset: loan-side products: Personal Loan, etc.
# Bank's objective: predict the likelihood of a Liability customer buying a Personal Loan.
deposit_products = ldf.mrtg + ldf.secac + ldf.cdac + ldf.online + ldf.ccrd  # deposit products held per customer
# Type A: no relationship at all — neither deposit products nor a personal loan:
noac = len(ldf[(deposit_products + ldf.ploan) < 1])
# Type B: personal loan (asset) only, no deposit products:
plonly = len(ldf[(deposit_products < 1) & (ldf.ploan > 0)])
# Type C: deposit products (liability) only, no personal loan:
dponly = len(ldf[(deposit_products > 0) & (ldf.ploan < 1)])
print('\n* Type A: No Relation; No A/Cs:', noac, '= ("Noise", These Rows To Be Deleted)')
print('\n* Type B: Asset/P.Loan Only; No Deposits:', plonly, '= (NOT a Banks target for this campaign)')
print('\n* Type C: Liability/Deposit Only; No Loans:',dponly, '= (Banks TARGET for this campaign): Sample Data: See Below:')
ldf[(deposit_products > 0) & (ldf.ploan < 1)]
# Prep: remove "Type A" "noise" rows — customers with NO relationship with the
# bank, i.e. no deposit (liability) products and no personal loan (asset).
# Take a backup first:
ldf4 = ldf.copy()
type(ldf4)
len(ldf), len(ldf4)  # FIX: the original compared len(ldf4) with itself
ldf4.head()
print('\n* Number of Type A rows to be removed:', noac)
print('\n* Sample of Type A rows to be removed:',)
type_a_mask = (ldf.mrtg + ldf.ploan + ldf.secac + ldf.cdac + ldf.online + ldf.ccrd) < 1
ldf[type_a_mask]
# BUG FIX: the filtering assignment was commented out in the original, so the
# Type A rows were never actually dropped even though the message below
# claimed they were. Apply the filter for real:
ldf = ldf[~type_a_mask]
ldf[(ldf.mrtg + ldf.ploan + ldf.secac + ldf.cdac + ldf.online + ldf.ccrd) < 1]  # now empty
print('\n* Type A rows removed / filtered: No "Type A" Rows present Above^')
ldf5 = ldf.copy()  # incremental backup of the dataset processed so far
# Convert each categorical variable to the pandas "category" dtype:
ldf.info()
for cat_col in ['fml', 'edu', 'ploan', 'secac', 'cdac', 'online', 'ccrd']:
    ldf[cat_col] = ldf[cat_col].astype('category')
ldf.info()
# Take a backup of processed df so far:
ldf6 = ldf.copy()
type(ldf6)
len(ldf), len(ldf6)
ldf6.sample(6)
# One-hot encode the multi-level categoricals "fml" and "edu":
ldf.head()
dummy_source_cols = ['fml', 'edu']
ldf = pd.get_dummies(ldf, columns=dummy_source_cols)
ldf.head()
ldf.info()
# Take a backup of processed df so far:
ldf7 = ldf.copy()
type(ldf7)
len(ldf), len(ldf7)
ldf7.sample(7)
# Univariate / bivariate analysis on the pre-dummies snapshot (ldf5):
sns.pairplot(ldf5)
# Correlation coefficients — shown as a table, then as an annotated heatmap.
# NOTE(review): under pandas >= 2.0, DataFrame.corr() may need
# numeric_only=True once category columns are present — confirm version.
pairwise_corr = ldf5.corr()
pairwise_corr
sns.heatmap(pairwise_corr, annot=True)
# Justification for the Experience column "expr": since "expr" is strongly
# correlated with "age", either one can be derived/predicted/estimated from
# the other, so only one of them is needed to train the model. However, it
# does make a difference to the analysis whether the customer has any
# experience at all (expr == 0 vs expr > 0). Hence, instead of removing the
# "expr" column completely, we derive a categorical feature from it.
# Since what matters is whether the customer HAS experience (yes or no), we
# code this column as "0" = NO experience (expr < 1) and "1" = SOME
# experience (expr > 0), then change the "expr" column's datatype to
# "category".
# This eliminates the redundant and unneeded correlation between "age" and
# "expr" while still providing a meaningful value (expr = yes/no) for model
# training.
# Take a backup of processed df so far:
ldf8 = ldf.copy()
type(ldf8)
len(ldf8), len(ldf) # compare
ldf8.sample(8) # check
# Reduce "expr" to a yes/no flag: 1 = some experience, 0 = none.
# BUG FIX: the original used chained indexing (ldf.expr[mask] = 1), a
# SettingWithCopy pattern that is a silent no-op under pandas >= 2.0
# copy-on-write; .loc assigns on the frame itself.
ldf.loc[ldf.expr > 0, 'expr'] = 1
ldf.expr.value_counts()
ldf.sample(6)
sns.heatmap(ldf.corr(), annot=True)
# As the heatmap above shows, the redundant and unneeded correlation between
# "age" and "expr" has been eliminated, while at the same time the importance
# / value of Experience ("expr") has been meaningfully retained via feature
# encoding / engineering.
# Cast the newly engineered 0/1 "expr" flag from "int64" to "category":
ldf['expr'] = ldf['expr'].astype('category')
ldf.info()
# Take a backup of processed df so far:
ldf9 = ldf.copy()
type(ldf9), len(ldf9), len(ldf)  # compare against the current working copy
ldf9.sample(9)  # spot-check a few rows
# Histograms of the 4 non-categorical columns.
# FIX: sns.distplot was deprecated in seaborn 0.11 and later removed;
# sns.histplot(..., kde=True, stat='density') is the modern equivalent
# (density-scaled histogram with a KDE overlay, matching distplot's output).
fig, axs = plt.subplots(ncols = 4, figsize = (30, 10))
sns.histplot(ldf.age, kde=True, stat='density', ax = axs[0])
sns.histplot(ldf.income, kde=True, stat='density', ax = axs[1])
sns.histplot(ldf.ccavg, kde=True, stat='density', ax = axs[2])
sns.histplot(ldf.mrtg, kde=True, stat='density', ax = axs[3])
cols = ['age','income','ccavg','mrtg'] # Select the 4 non Categorical columns
ldf[cols].hist(stacked=False, bins=100, figsize=(12,30), layout=(14,2));
# Count plots (visual value counts) for every 'category'-dtype column.
# BUG FIX: the original called plt.figure() inside each loop iteration,
# spawning a blank 4x2 figure per column while still drawing on `axs`.
cat_cols = list(ldf.columns[ldf.dtypes == 'category'])
fig, axs = plt.subplots(ncols = 6, figsize = (30, 10))
for ax, col in zip(axs, cat_cols):
    sns.countplot(data = ldf, x = col, ax = ax)
# Count plots for the dummy columns produced by get_dummies.
# FIX: pandas >= 2.0 emits bool dummies (older versions used uint8), so the
# original uint8-only dtype filter would select nothing there; match both.
dummy_cols = list(ldf.columns[(ldf.dtypes == 'uint8') | (ldf.dtypes == 'bool')])
fig, axs = plt.subplots(ncols = 7, figsize = (30, 10))
for ax, col in zip(axs, dummy_cols):
    sns.countplot(data = ldf, x = col, ax = ax)
# Split the data into train and test sets at a 70:30 ratio.
X = ldf.drop('ploan',axis=1)  # predictor feature columns
Y = ldf['ploan']              # predicted class (1 = True, 0 = False)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=3)
# Confirm the split landed in the required 70:30 proportion:
n_total = len(X)
n_train = len(X_train)
n_test = len(X_test)
print('\n* Data split into 70:30 Ratio as Required: See Below:')
print('\n* TRAIN dataset %rows', round(n_train * 100 / n_total, 2), '%')
print('* TEST dataset %rows', round(n_test * 100 / n_total, 2), '%')
# Baseline class balance of the target ("ploan"): ratio of 1s ("Yes") to 0s ("No").
Yp = ldf['ploan'].value_counts(normalize=True)
print('\n* There is some Class (ploan) Imbalance: 0 = No; 1 = Yes')
print(Yp)
# Compare True/False percentages across the original / train / test splits.
# Refactor: one loop replaces six near-identical print statements.
print('\n* Percentage of True/False values of Predictor Class P.Loan:\n')
for split_name, labels in (('Original', ldf['ploan']), ('Training', y_train), ('Test', y_test)):
    n_true = int((labels == 1).sum())
    n_false = int((labels == 0).sum())
    print("{0} P.loan True Values : {1} ({2:0.2f}%)".format(split_name, n_true, n_true / len(labels) * 100))
    print("{0} P.loan False Values : {1} ({2:0.2f}%)".format(split_name, n_false, n_false / len(labels) * 100))
    print("")
print("\n* Percentages seem uniform among Original, Train, Test datasets")  # typo fix: "Origina"
# ---- Logistic Regression model 2: liblinear solver, L2 penalty ----
model2 = LogisticRegression(solver="liblinear", penalty='l2', random_state=3)
model2.fit(X_train, y_train)        # fit on the training split
y_predict = model2.predict(X_test)  # predict on the held-out test split
# Inspect the learned coefficients together with the intercept:
coef_df = pd.DataFrame(model2.coef_)
coef_df['intercept'] = model2.intercept_
print(coef_df)
# Accuracy (mean accuracy score) on the TEST split:
accScore = model2.score(X_test, y_test)
print("Model2 Score = %f" %(accScore))
# Confusion matrix with the positive class (1) listed first:
cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
cm
# Label the axes and render as an annotated heatmap:
df_cm = pd.DataFrame(cm, index=["1", "0"], columns=["Predict 1", "Predict 0"])
df_cm
plt.figure(figsize = (11,7))
sns.heatmap(df_cm, annot=True, fmt="d", square=True)
plt.ylabel('Actual')
plt.xlabel('Predicted')
# Confusion matrix conclusion:
# True Positives (TP): correct prediction: these customers WILL buy a P.Loan = 119
# True Negatives (TN): correct prediction: these customers WILL NOT buy a P.Loan = 1088
# False Positives (FP): incorrect prediction: predicted WILL buy but did not = 11 ("Type I" error)
# False Negatives (FN): incorrect prediction: predicted WILL NOT buy but did = 39 ("Type II" error)
# ---- Model 1: switch the penalty to L1 to try to improve the model ----
model1 = LogisticRegression(solver="liblinear", penalty='l1', random_state=3)
model1.fit(X_train, y_train)  # fit on the training split
# Predict on test
y_predict = model1.predict(X_test)
# Inspect the learned coefficients and intercept:
coef_df = pd.DataFrame(model1.coef_)
coef_df['intercept'] = model1.intercept_
print(coef_df)
# Accuracy (score) of the model on the TEST data:
accScore = model1.score(X_test, y_test)
print("Model1 Score = %f" %(accScore))  # BUG FIX: label wrongly said "Model2"
# BUG FIX: accuracy was hard-coded as (117+1088)/(117+1088+11+41), using stale
# numbers that did not even match the 119/39 quoted in the conclusion above;
# compute it from the actual predictions instead.
Accuracy = metrics.accuracy_score(y_test, y_predict)
print("Accuracy = %f" %Accuracy)
# Additional classification metrics for model1 (the predictions array can be
# passed directly — no DataFrame wrapper needed; results are identical):
print("Recall:", recall_score(y_test, y_predict))
print("Precision:", precision_score(y_test, y_predict))
print("F1 Score:", f1_score(y_test, y_predict))
print("Roc Auc Score:", roc_auc_score(y_test, y_predict))
# ROC curve and its AUC:
lg_roc_auc = roc_auc_score(y_test, model1.predict(X_test))
fpr, tpr, thresholds = roc_curve(y_test, model1.predict_proba(X_test)[:, 1])
lg_roc_auc
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % lg_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
# ---- Model improvement: hyper-parameter tuning ----
model1.get_params()  # current parameter settings
# Try every available solver (all support l2; liblinear also supports l1):
train_score=[]
test_score=[]
solver = ['newton-cg','lbfgs','liblinear','sag','saga']
for solver_name in solver:
    candidate = LogisticRegression(random_state=42, penalty='l2', C = 0.75, solver=solver_name)
    candidate.fit(X_train, y_train)
    y_predict = candidate.predict(X_test)
    # Record train and test accuracy for this solver:
    train_score.append(round(candidate.score(X_train, y_train),3))
    test_score.append(round(candidate.score(X_test, y_test),3))
print(solver)
print(train_score)
print(test_score)
# Try the L1 penalty with the saga solver:
model = LogisticRegression(random_state=42, penalty='l1', solver='saga')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print("Training accuracy",model.score(X_train,y_train))  # typo fix: "Trainig"
print("Testing accuracy",model.score(X_test, y_test))
# Try the L1 penalty with the liblinear solver:
model = LogisticRegression(random_state=42, penalty='l1', solver='liblinear')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print("Training accuracy",model.score(X_train,y_train))
print("Testing accuracy",model.score(X_test, y_test))
# Add balanced class weights to counter the ploan class imbalance:
model = LogisticRegression(random_state=42, solver='liblinear', penalty='l1',class_weight='balanced')
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
print("Training accuracy",model.score(X_train,y_train))
print("Testing accuracy",model.score(X_test, y_test))
# Sweep the inverse-regularisation strength C and record accuracies:
train_score=[]
test_score=[]
C = [0.01,0.1,0.25,0.5,0.75,1]
for c_value in C:
    candidate = LogisticRegression(random_state=42, solver='liblinear', penalty='l1',
                                   class_weight='balanced', C=c_value)
    candidate.fit(X_train, y_train)
    y_predict = candidate.predict(X_test)
    # Accumulate per-C train/test accuracy for side-by-side comparison:
    train_score.append(round(candidate.score(X_train,y_train),3))
    test_score.append(round(candidate.score(X_test, y_test),3))
print(C)
print(train_score)
print(test_score)
# ---- Final / best model: liblinear solver, L1 penalty, balanced class weights, C=0.25 ----
model = LogisticRegression(random_state=42, solver='liblinear', penalty='l1', class_weight='balanced',C=0.25)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
# Confusion matrix with the positive class (1) listed first, as a heatmap:
cm = metrics.confusion_matrix(y_test, y_predict, labels=[1, 0])
# cm
df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                     columns = [i for i in ["Predict 1","Predict 0"]])
# df_cm
plt.figure(figsize = (11,7))
sns.heatmap(df_cm, annot=True, fmt="d", square=True)
plt.ylabel('Actual')
plt.xlabel('Predicted')
# Summary metrics for the final model:
print('Here is The FINAL / BEST Model:')
print()
print("Training accuracy",model.score(X_train,y_train))  # typo fix: "Trainig"
print()
print("Testing accuracy",model.score(X_test, y_test))
print()
print("Recall:",recall_score(y_test,y_predict))
print()
print("Precision:",precision_score(y_test,y_predict))
print()
print("F1 Score:",f1_score(y_test,y_predict))
print()
print("Roc Auc Score:",roc_auc_score(y_test,y_predict))
print()
print('Confusion Matrix:')